In [1]:
% matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests
import pickle
from user_object import User

Feature extraction

Our measures of user activity over a time span include:

  1. number of edits in all namespaces
  2. number of days active (a user is active on a day if they make at least one edit in any namespace)
  3. number of edit sessions (an edit session is a sequence of edits without a gap of 60 minutes or more)
  4. indicator of whether the user made at least one edit in any namespace

Our measures of harassment received/made over a time span are:

  1. number of comments received/made that classifier clf scored above the threshold
  2. number of comments received/made that scored above the threshold for any of our 3 harassment classifiers
  3. indicator of whether the user received/made at least one comment that scored above threshold for any of our 3 harassment classifiers

We also gather:

  1. each user's gender
  2. and the number of user warnings the editor received

As mentioned above, we gather activity and harassment features for newcomers in timespan t1 and see how they correlate with activity features in timespan t2.

In the following analysis, the two time spans we are interested in are the first and second month after user registration.


In [2]:
def select_month_since_start(user, activity, t):
    """Restrict `activity` to the t-th month after the user's first edit.

    Month t covers the half-open window
    [first_edit_day + (t-1) months, first_edit_day + t months).
    Rows are kept when their 'timestamp' falls inside that window.
    """
    window_start = user.first_edit_day + relativedelta(months=t - 1)
    window_stop = user.first_edit_day + relativedelta(months=t)
    in_window = (activity['timestamp'] >= window_start) & (activity['timestamp'] < window_stop)
    return activity[in_window]

def count_edits(user, t):
    """Total number of revisions the user made in month t, across all namespaces.

    Returns 0 when the user has no recorded activity at all.
    """
    if user.df_activity is None:
        return 0
    month_activity = select_month_since_start(user, user.df_activity, t)
    return month_activity['n_revisions'].sum()

def count_ns0_revisions(user, t):
    """Number of revisions the user made in namespace 0 (articles) in month t.

    Returns 0 when the user has no recorded activity at all.
    Note: 'ns' is compared as the string '0', matching how it is stored.
    """
    if user.df_activity is None:
        return 0
    month_activity = select_month_since_start(user, user.df_activity, t)
    ns0_activity = month_activity[month_activity['ns'] == '0']
    return ns0_activity['n_revisions'].sum()


def count_days_active(user, t):
    """Number of distinct activity timestamps for the user in month t.

    Counts unique values of the 'timestamp' column (presumably one per
    active day — TODO confirm df_activity is day-grained).
    Returns 0 when the user has no recorded activity at all.
    """
    if user.df_activity is None:
        return 0
    month_activity = select_month_since_start(user, user.df_activity, t)
    return len(month_activity['timestamp'].unique())

def count_score_received_above_threshold(user, score, threshold, t):
    """Count comments received by the user in month t whose `score` column
    is strictly greater than `threshold`.

    Returns 0 when the user received no comments at all.
    """
    comments = user.df_comments_to
    if comments is None:
        return 0
    month_comments = select_month_since_start(user, comments, t)
    return (month_comments[score] > threshold).sum()

def count_score_made_above_threshold(user, score, threshold, t):
    """Count comments made by the user in month t whose `score` column
    is strictly greater than `threshold`.

    Returns 0 when the user made no comments at all.
    """
    comments = user.df_comments_from
    if comments is None:
        return 0
    month_comments = select_month_since_start(user, comments, t)
    return (month_comments[score] > threshold).sum()

def is_female(u):
    """Return 1 if the user's recorded gender is 'female', else 0."""
    return 1 if u.gender == 'female' else 0

def is_male(u):
    """Return 1 if the user's recorded gender is 'male', else 0."""
    return 1 if u.gender == 'male' else 0

def count_warnings_received(user, t):
    """Number of user warnings the editor received in month t.

    Returns 0 when the user has no warning records at all.
    """
    if user.df_uw is None:
        return 0
    month_warnings = select_month_since_start(user, user.df_uw, t)
    return month_warnings.shape[0]

def count_fraction_of_ns0_revisions_x(user, x, t):
    """Fraction of the user's namespace-0 revisions in month t counted by
    column `x` (e.g. 'n_deleted_revisions', 'n_productive_revisions').

    Returns 0 when the user has no activity records, or made no
    namespace-0 revisions in month t (avoids division by zero).
    """
    if user.df_activity is None:
        return 0

    ns0_activity = user.df_activity.query("ns=='0'")
    ns0_activity = select_month_since_start(user, ns0_activity, t)

    # Hoisted so the denominator is computed once, not twice.
    total_revisions = ns0_activity['n_revisions'].sum()
    if total_revisions < 1:
        return 0

    return float(ns0_activity[x].sum()) / total_revisions

In [3]:
# Map of feature name -> extractor function over a User object.
# 'm1_*' / 'm2_*' features refer to the first / second month after the
# user's first edit.
feature_map = {
    'first_edit_day' : lambda u: u.first_edit_day,
    'm1_num_ns0_edits' : lambda u: count_ns0_revisions(u, 1),
    'user_id' : lambda u : u.user_id,
    'is_female' : is_female,
    'is_male' : is_male,
    'has_gender' : lambda u: int(is_female(u) or is_male(u)),
    'm1_num_edits' : lambda u: count_edits(u, 1),
    'm2_num_edits' : lambda u: count_edits(u, 2),
    'm1_num_days_active' : lambda u: count_days_active(u, 1),
    'm2_num_days_active' : lambda u: count_days_active(u, 2),
    # NOTE: 'recieved' misspelling kept intentionally — it is a CSV column
    # name that downstream analyses already depend on.
    'm1_num_warnings_recieved' : lambda u: count_warnings_received(u, 1),
    'm1_fraction_ns0_deleted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_deleted_revisions', 1),
    'm1_fraction_ns0_reverted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_identity_reverted_revisions', 1),
    'm1_fraction_ns0_productive' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_productive_revisions', 1),
    'm1_active' : lambda u: int(count_edits(u, 1) > 0),
    'm2_active' : lambda u: int(count_edits(u, 2) > 0),
}

# Harassment features: one received/made count per (classifier, threshold)
# pair, replacing 24 copy-pasted assignments with a loop.
# IMPORTANT: the `col=col, thresh=thresh` default arguments bind the loop
# variables at lambda-definition time; a plain closure would late-bind and
# every lambda would end up using the final ('toxicity', 0.85) pair.
for clf in ('aggression', 'attack', 'toxicity'):
    for thresh in (0.01, 0.425, 0.75, 0.85):
        col = 'pred_%s_score' % clf
        feature_map['m1_num_%s_received_%.3f' % (clf, thresh)] = \
            lambda u, col=col, thresh=thresh: count_score_received_above_threshold(u, col, thresh, 1)
        feature_map['m1_num_%s_made_%.3f' % (clf, thresh)] = \
            lambda u, col=col, thresh=thresh: count_score_made_above_threshold(u, col, thresh, 1)

In [4]:
# Load precomputed User objects for the random editor sample.
# NOTE(review): pickle.load executes arbitrary code from the file — fine for
# this locally generated cache, but never load pickles from untrusted sources.
random_user_objects = pickle.load(open("../../data/retention/random_user_data.pkl", "rb"))

In [5]:
# Build one feature column per extractor, one row per user in the random
# sample; index by user_id and keep only users active in their first month.
feature_columns = {name: [extract(u) for u in random_user_objects]
                   for name, extract in feature_map.items()}
df_features = pd.DataFrame(feature_columns).set_index('user_id')
print(df_features.shape)
df_active = df_features[df_features['m1_active'] == 1]
print(df_active.shape[0])
df_active.to_csv("../../data/retention/random_user_sample_features.csv")


(100000, 39)
100000

In [6]:
# Load precomputed User objects for the attacked editor sample.
# NOTE(review): pickle.load executes arbitrary code from the file — fine for
# this locally generated cache, but never load pickles from untrusted sources.
attacked_user_objects = pickle.load(open("../../data/retention/attacked_user_data.pkl", "rb"))

In [7]:
# Same featurization as for the random sample, applied to attacked users.
feature_columns = {name: [extract(u) for u in attacked_user_objects]
                   for name, extract in feature_map.items()}
df_features = pd.DataFrame(feature_columns).set_index('user_id')
print(df_features.shape)
df_active = df_features[df_features['m1_active'] == 1]
print(df_active.shape[0])
df_active.to_csv("../../data/retention/attacked_user_sample_features.csv")


(27690, 39)
27690